import os
import json
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
Based on Republic Act 8425, otherwise known as Social Reform and Poverty Alleviation Act, dated 11 December 1997, the poor refers to individuals and families whose income fall below the poverty threshold as defined by the government and/or those that cannot afford in a sustained manner to provide their basic needs of food, health, education, housing and other amenities of life.
Starting with the given metadata or the catalog of the dataset
df_metadata = pd.read_csv(r'data/csv/190710_poverty-statistics-metadata.csv').dropna()
df_metadata
| Field | Content | |
|---|---|---|
| 0 | Region Code | Modified PSGC code of the Region. This is a tw... |
| 1 | Region | Source of the PSGC code for the Region |
| 2 | Province Code | Modified PSGC code of the Province. This is a ... |
| 3 | Province | Source of the PSGC code for the Province |
| 4 | City_Mun Code | Modified PSGC code of the City/Municipality. T... |
| 5 | City_Municipality | Source of the PSGC code for the City/Municipal... |
| 7 | Glossary of terms | https://psa.gov.ph/poverty-press-releases/glos... |
| 8 | Poverty estimation methodoloy | For details of poverty estimation methodology,... |
| 9 | Poverty data | https://psa.gov.ph/poverty-sae-press-releases/... |
| 10 | Technical notes | https://psa.gov.ph/poverty-press-releases/tech... |
| 12 | Notes: | 1. "PH" in front of the PSGC coding scheme in ... |
Converting the metadata dataframe to dictionary for easy access of content
dict_metadata = dict(zip(df_metadata['Field'].values, df_metadata['Content'].values))
print(dict_metadata['Notes:'])
1. "PH" in front of the PSGC coding scheme in order to solve the problem of the "0" that falls in several application 2. The standard deviation of an estimate can be derived by multiplying the poverty incidence and coefficient of variation then divide by 100. 3. The Municipality of Bumbaran, Lanao del Sur was renamed as Municipality of Amai Manabilang per Muslim Mindanao Autonomy Acto No. 316, series of 2014 and was ratified through a plebiscite on 07 April 2018.
Reading the main dataset
df = pd.read_csv(r'data/csv/190710_poverty-statistics.csv')
df.head()
| Region | Region code | Province | Province code | Annual Per Capita Poverty Threshold \n(in Php)_2006 | Annual Per Capita Poverty Threshold \n(in Php)_2009 | Annual Per Capita Poverty Threshold \n(in Php)_2012 | Annual Per Capita Poverty Threshold \n(in Php)_2015 | Poverty Incidence among Families (%)_Est (%)_2006 | Poverty Incidence among Families (%)_Est (%)_2009 | Poverty Incidence among Families (%)_Est (%)_2012 | Poverty Incidence among Families (%)_Est (%)_2015 | Poverty Incidence among Families (%)_CV_2006 | Poverty Incidence among Families (%)_CV_2009 | Poverty Incidence among Families (%)_CV_2012 | Poverty Incidence among Families (%)_CV_2015 | Magnitude of Poor Families_Est_2006 | Magnitude of Poor Families_Est_2009 | Magnitude of Poor Families_Est_2012 | Magnitude of Poor Families_Est_2015 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, CITY OF MANILA, FIRST DISTRICT (Not a Pro... | PH133900000 | 15,699 | 19,227 | 20,344 | 25,007 | 2.7 | 3.2 | 3.6 | 3.5 | 29.6 | 48.7 | 24.8 | 26.7 | 9,906 | 12,405 | 14,343 | 12,710 |
| 1 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, SECOND DISTRICT (Not a Province)b/ | PH137400000 | 15,699 | 19,227 | 20,344 | 25,007 | 3.2 | 2.2 | 1.9 | 1.9 | 48.0 | 21.4 | 25.6 | 40.4 | 28,247 | 21,168 | 19,782 | 21,727 |
| 2 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, THIRD DISTRICT (Not a Province) | PH137500000 | 15,699 | 19,227 | 20,344 | 25,007 | 3.3 | 3.2 | 2.8 | 3.3 | 19.5 | 23.9 | 23.7 | 19.6 | 18,631 | 19,306 | 18,266 | 22,352 |
| 3 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, FOURTH DISTRICT (Not a Province)b/ | PH137600000 | 15,699 | 19,227 | 20,344 | 25,007 | 2.4 | 1.5 | 3.0 | 2.8 | 21.8 | 35.0 | 21.5 | 23.5 | 16,570 | 11,095 | 24,138 | 23,457 |
| 4 | CORDILLERA ADMINISTRATIVE REGION (CAR) | PH140000000 | ABRA | PH140100000 | 14,680 | 17,852 | 19,775 | 21,240 | 39.4 | 38.9 | 27.2 | 19.9 | 10.8 | 21.0 | 21.2 | 13.2 | 18,054 | 18,852 | 13,914 | 12,400 |
Including only the geographical information and the magnitude of poor families estimation by year column for later on easy to digest visualization
columns_to_include = [x for x in df.columns if not (x.startswith('Poverty') or x.startswith('Annual'))]
columns_to_include
['Region', 'Region code', 'Province', 'Province code', 'Magnitude of Poor Families_Est_2006', 'Magnitude of Poor Families_Est_2009', 'Magnitude of Poor Families_Est_2012', 'Magnitude of Poor Families_Est_2015']
Extracting the data with the columns to include and turning it to a dataframe
df = df[columns_to_include]
df.head()
| Region | Region code | Province | Province code | Magnitude of Poor Families_Est_2006 | Magnitude of Poor Families_Est_2009 | Magnitude of Poor Families_Est_2012 | Magnitude of Poor Families_Est_2015 | |
|---|---|---|---|---|---|---|---|---|
| 0 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, CITY OF MANILA, FIRST DISTRICT (Not a Pro... | PH133900000 | 9,906 | 12,405 | 14,343 | 12,710 |
| 1 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, SECOND DISTRICT (Not a Province)b/ | PH137400000 | 28,247 | 21,168 | 19,782 | 21,727 |
| 2 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, THIRD DISTRICT (Not a Province) | PH137500000 | 18,631 | 19,306 | 18,266 | 22,352 |
| 3 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, FOURTH DISTRICT (Not a Province)b/ | PH137600000 | 16,570 | 11,095 | 24,138 | 23,457 |
| 4 | CORDILLERA ADMINISTRATIVE REGION (CAR) | PH140000000 | ABRA | PH140100000 | 18,054 | 18,852 | 13,914 | 12,400 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 85 entries, 0 to 84 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 85 non-null object 1 Region code 85 non-null object 2 Province 85 non-null object 3 Province code 85 non-null object 4 Magnitude of Poor Families_Est_2006 85 non-null object 5 Magnitude of Poor Families_Est_2009 85 non-null object 6 Magnitude of Poor Families_Est_2012 85 non-null object 7 Magnitude of Poor Families_Est_2015 85 non-null object dtypes: object(8) memory usage: 5.4+ KB
Stripping all the columns of the all the possible leading and trailing white spaces
df = df.applymap(str.strip)
Stripping the province column of the trailing 'b/' characters
df['Province'] = df['Province'].str.rstrip(r'b/')
Checking the magnitude estimation columns of possible parsing problems
magnitude_pf_est_by_year_column_names = [x for x in columns_to_include if x.startswith('Magnitude')]
print("Columns with complication: ")
for column in magnitude_pf_est_by_year_column_names:
try:
pd.to_numeric(df[column].squeeze())
except:
print(f"\t{column}")
Columns with complication: Magnitude of Poor Families_Est_2006 Magnitude of Poor Families_Est_2009 Magnitude of Poor Families_Est_2012 Magnitude of Poor Families_Est_2015
for column in magnitude_pf_est_by_year_column_names:
print(column)
print(f"{df[column].squeeze().str.strip().sort_values().head(3)}\n")
Magnitude of Poor Families_Est_2006 19 10,211 30 11,576 9 11,920 Name: Magnitude of Poor Families_Est_2006, dtype: object Magnitude of Poor Families_Est_2009 52 10,701 68 100,860 13 103,487 Name: Magnitude of Poor Families_Est_2009, dtype: object Magnitude of Poor Families_Est_2012 81 100,946 46 102,924 59 104,133 Name: Magnitude of Poor Families_Est_2012, dtype: object Magnitude of Poor Families_Est_2015 14 - 45 1,594 75 10,570 Name: Magnitude of Poor Families_Est_2015, dtype: object
Replacing the comma with empty space and the dash with none type as to drop all of it with dropna() later on
def clean_df_columns(dataframe: pd.DataFrame, columns_to_clean: list):
for column in columns_to_clean:
df[column] = (
df[column]
.str.replace(',','')
.replace('-', None))
return df
Cleaning and parsing magnitude_pf_est_by_year_column_names columns to int64
df = (
clean_df_columns(
dataframe = df.convert_dtypes(),
columns_to_clean = magnitude_pf_est_by_year_column_names)
.dropna()
.astype(dict(zip(
magnitude_pf_est_by_year_column_names,
['int64' for x in range(len(magnitude_pf_est_by_year_column_names))]))) )
df.dtypes
Region object Region code object Province object Province code object Magnitude of Poor Families_Est_2006 int64 Magnitude of Poor Families_Est_2009 int64 Magnitude of Poor Families_Est_2012 int64 Magnitude of Poor Families_Est_2015 int64 dtype: object
df.describe()
| Magnitude of Poor Families_Est_2006 | Magnitude of Poor Families_Est_2009 | Magnitude of Poor Families_Est_2012 | Magnitude of Poor Families_Est_2015 | |
|---|---|---|---|---|
| count | 84.000000 | 84.000000 | 84.000000 | 84.000000 |
| mean | 45341.869048 | 48050.202381 | 50167.988095 | 44601.357143 |
| std | 37229.006994 | 38136.162504 | 40596.601751 | 38303.600739 |
| min | 3479.000000 | 3642.000000 | 3429.000000 | 1594.000000 |
| 25% | 15409.000000 | 15604.750000 | 17907.500000 | 14672.750000 |
| 50% | 39949.000000 | 43009.500000 | 39198.500000 | 32290.500000 |
| 75% | 61509.250000 | 69217.500000 | 75694.000000 | 63847.250000 |
| max | 209301.000000 | 200481.000000 | 185603.000000 | 179162.000000 |
geo_ph_regions = json.load(open(r'data/geojson/regions/regions.0.01.json'))
geo_ph_regions
Peeking to geojson contents and looking for key to map with the dataframe
geo_ph_regions['features']
df.head(5)
| Region | Region code | Province | Province code | Magnitude of Poor Families_Est_2006 | Magnitude of Poor Families_Est_2009 | Magnitude of Poor Families_Est_2012 | Magnitude of Poor Families_Est_2015 | |
|---|---|---|---|---|---|---|---|---|
| 0 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, CITY OF MANILA, FIRST DISTRICT (Not a Pro... | PH133900000 | 9906 | 12405 | 14343 | 12710 |
| 1 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, SECOND DISTRICT (Not a Province) | PH137400000 | 28247 | 21168 | 19782 | 21727 |
| 2 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, THIRD DISTRICT (Not a Province) | PH137500000 | 18631 | 19306 | 18266 | 22352 |
| 3 | NATIONAL CAPITAL REGION (NCR) | PH130000000 | NCR, FOURTH DISTRICT (Not a Province) | PH137600000 | 16570 | 11095 | 24138 | 23457 |
| 4 | CORDILLERA ADMINISTRATIVE REGION (CAR) | PH140000000 | ABRA | PH140100000 | 18054 | 18852 | 13914 | 12400 |
df.to_csv(r'data/csv/poverty_statistics_clnd.csv')
def save_fig_binary(figure, file_name: str, folder_name: str, ) -> None:
"""Saves figure to binary format"""
data_root_path = f'data/bin/{folder_name}'
if not os.path.exists('data'):
os.mkdir('data')
if not os.path.exists('data/bin'):
os.mkdir('data/bin')
if not os.path.exists(data_root_path):
os.mkdir(data_root_path)
with open(f'{data_root_path}/{file_name}.bin', 'wb') as file:
pickle.dump(figure, file)
def load_fig_binary(file_name: str, folder_name: str):
"""Loads and returns binary figure"""
data_root_path = f'data/bin/{folder_name}'
if os.path.exists(f'{data_root_path}/{file_name}.bin'):
file = open(f'{data_root_path}/{file_name}.bin', 'rb')
figure = pickle.load(file)
file.close()
return figure
else:
return None
year_by_column_name = dict(zip(['2006', '2009', '2012', '2015'], magnitude_pf_est_by_year_column_names))
Making a function to automate workflow in plotting by region with year as the argument
def plot_by_region(dataframe, year: str):
file_name = 'by_region'
folder_name = 'regions'
fig = load_fig_binary(file_name = file_name, folder_name = folder_name)
if fig is not None:
return fig
else:
geojson_path = 'data/geojson'
geojson_folder_name = 'regions'
geojson_file_name = 'regions.0.01'
geojson_file = json.load(open(rf'{geojson_path}/{geojson_folder_name}/{geojson_file_name}.json'))
fig = px.choropleth(
data_frame = df,
geojson = geojson_file,
featureidkey = 'properties.ADM1_PCODE',
locations = 'Region code',
color = year_by_column_name[year],
scope = 'asia',
color_continuous_scale = px.colors.sequential.Reds,
custom_data = ['Region', 'Region code', year_by_column_name[year]])
fig.update_traces(
hovertemplate = '<br>'.join([
'Region: %{customdata[0]}',
'Region code: %{customdata[1]}',
'<b>Magnitude of poor families estimation: %{customdata[2]:,}</b>' ]),
)
fig.update_geos(fitbounds = 'locations', visible = False,)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
save_fig_binary(figure = fig, file_name = file_name, folder_name = folder_name)
return fig
Making a function to automate workflow in plotting by province with year as the argument
region_name_by_region_code = {}
for name, code in df[['Region', 'Region code']].value_counts().to_frame().index:
region_name_by_region_code.update({name: code})
region_name_by_region_code
{'REGION III (CENTRAL LUZON)': 'PH030000000',
'REGION VIII (EASTERN VISAYAS)': 'PH080000000',
'CORDILLERA ADMINISTRATIVE REGION (CAR)': 'PH140000000',
'REGION V (BICOL REGION)': 'PH050000000',
'REGION VI (WESTERN VISAYAS)': 'PH060000000',
'AUTONOMOUS REGION IN MUSLIM MINDANAO (ARMM)': 'PH150000000',
'MIMAROPA REGION': 'PH170000000',
'REGION XII (SOCCSKSARGEN)': 'PH120000000',
'REGION X (NORTHERN MINDANAO)': 'PH100000000',
'REGION IV-A (CALABARZON)': 'PH040000000',
'REGION XI (DAVAO REGION)': 'PH110000000',
'REGION IX (ZAMBOANGA PENINSULA)': 'PH090000000',
'REGION VII (CENTRAL VISAYAS)': 'PH070000000',
'REGION II (CAGAYAN VALLEY)': 'PH020000000',
'REGION I (ILOCOS REGION)': 'PH010000000',
'NATIONAL CAPITAL REGION (NCR)': 'PH130000000',
'REGION XIII (Caraga)': 'PH160000000'}
def plot_by_province(dataframe, year: str, region_name: str):
region_code = region_name_by_region_code[region_name]
file_name = f'by_region_{region_code}'
folder_name = 'provinces'
fig = load_fig_binary(file_name = file_name, folder_name = folder_name)
if fig is not None:
return fig
else:
dataframe = dataframe.loc[dataframe['Region'].isin([region_name])]
geojson_path = 'data/geojson'
geojson_folder_name = 'provinces'
geojson_file_name = f'provinces-region-{region_code.lower()}.0.01'
geojson_file = json.load(open(rf'{geojson_path}/{geojson_folder_name}/{geojson_file_name}.json'))
fig = px.choropleth(
data_frame = dataframe,
geojson = geojson_file,
featureidkey = 'properties.ADM2_PCODE',
locations = 'Province code',
color = year_by_column_name[year],
scope = 'asia',
color_continuous_scale = px.colors.sequential.Reds,
custom_data = ['Province', 'Province code', year_by_column_name[year]])
fig.update_traces(
hovertemplate = '<br>'.join([
'Province: %{customdata[0]}',
'Province code: %{customdata[1]}',
'<b>Magnitude of poor families estimation: %{customdata[2]:,}</b>' ]),
)
fig.update_geos(fitbounds = 'locations', visible = False,)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
save_fig_binary(figure = fig, file_name = file_name, folder_name = folder_name)
return fig
Magnitude of poor families estimation visualization by regions
plot_by_region(dataframe = df, year ='2006')
Magnitude of poor families estimation visualization by province
plot_by_province(dataframe = df, year = '2006', region_name = 'REGION IV-A (CALABARZON)')